# heading ----
# Reset the workspace: remove every object except functions and character
# variables.
rm(
  list = setdiff(
    ls(),
    c(lsf.str(), ls.str(mode = "character"))
  )
)

# Set the LANG environment variable to English.
Sys.setenv(LANG = "en")

# Print a four-line run banner: date, authors, description and R version.
message(
  paste(
    c(
      paste("Date:", format(Sys.Date(), "%d %B %Y")),
      "By: Cedric Chambru & Paul Maneuvrier-Hervieu",
      "Description: code - create figures on HiSCoD",
      paste("Version of R used:", R.Version()$version.string)
    ),
    collapse = "\n"
  )
)


# load data: HiSCoD ----
# Read the HiSCoD database (RDS file) from the folder given by `dir_data`.
df <- readRDS(file = paste0(dir_data, "db_hiscod_rds_v1_en.rds"))


# create variable: decade and century ----
# NOTE(review): `time_var()` is a project helper defined outside this script;
# it presumably derives the period variables from `year` (including the
# `quarter_century` column used for Figure 2) -- confirm against its definition.
df <- df %>%
  time_var(year)


# Figure 1: Geographical distribution of social conflicts in HiSCoD ----
# Plot settings for Figure 1.
# NOTE(review): apart from `file_name` and `y_scale`, these globals are not
# referenced again in this section; they are presumably read by the project
# helper `labs_plot()` -- confirm against its definition.
file_name <- paste("fig_1", "bar_plot_n_obs_country_1000_1900", sep = "_")
title_plot <- NULL
subtitle_plot <- NULL
caption_plot <- NULL
x_plot <- NULL
y_plot <- "Number of conflicts"
x_scale <- NULL
y_scale <- seq(from = 0, to = 20000, by = 5000)
title_legend <- NULL

# Horizontal bar plot of the number of conflicts per country.
df %>%
  # select data: drop observations for which country is unknown
  # (the former recode of NA to "Missing information" could never trigger
  # after this step and has been removed)
  tidyr::drop_na(country_name) %>%
  # total number of conflicts per country
  dplyr::summarise(
    tot_obs = dplyr::n(),
    .by = country_name
  ) %>%
  # select data: retain only countries with more than 25 observations
  dplyr::filter(tot_obs > 25) %>%
  # plot
  ggplot2::ggplot() +
  # one bar per country, ordered by number of conflicts
  ggplot2::geom_col(
    ggplot2::aes(
      x = reorder(country_name, tot_obs),
      y = tot_obs
    ),
    fill = "#999999"
  ) +
  # print the count next to each bar
  ggplot2::geom_text(
    ggplot2::aes(
      x = country_name,
      y = tot_obs,
      label = tot_obs
    ),
    size = 11,
    hjust = -0.25
  ) +
  # NOTE(review): the upper limit is hard-coded through `y_scale`; a country
  # with more than 20001 conflicts would fall outside the scale limits.
  ggplot2::scale_y_continuous(
    breaks = y_scale,
    limits = c(min(y_scale) - 1, max(y_scale) + 1)
  ) +
  # add labels for title, x-axis, and y-axis
  labs_plot() +
  # rotate plot
  ggplot2::coord_flip() +
  # define theme for producing bar plot
  ggplot2::theme_minimal(
    base_size = 40
  ) +
  theme_bar_plot()

# export plot (no `plot` argument: ggsave() saves the last plot)
ggplot2::ggsave(
  filename = paste0(dir_figures, file_name, ".png"),
  scale = 2,
  width = 10,
  height = 7,
  dpi = 300,
  device = "png"
)


# Figure 2: Number of social conflicts per quarter century ----
# Plot settings for Figure 2.
# NOTE(review): apart from `file_name` and `x_scale`, these globals are not
# referenced again in this section; they are presumably read by the project
# helper `labs_plot()` -- confirm against its definition.
file_name <- paste(
  "fig_2", "bar_plot_n_25_years_obs_conflict_1000_1900",
  sep = "_"
)
title_plot <- NULL
subtitle_plot <- NULL
caption_plot <- NULL
# the data are aggregated per quarter century, not per decade
x_plot <- "Quarter century"
y_plot <- "Number of conflicts"
x_scale <- seq(from = 1000, to = 1900, by = 100)
y_scale <- NULL
title_legend <- NULL

# Bar plot of the number of conflicts per quarter century.
df %>%
  # select data: drop observations for which exact timing is unknown
  tidyr::drop_na(quarter_century) %>%
  # total number of conflicts per quarter century
  dplyr::summarise(
    tot_obs = dplyr::n(),
    .by = quarter_century
  ) %>%
  # plot
  ggplot2::ggplot() +
  ggplot2::geom_col(
    ggplot2::aes(
      x = quarter_century,
      y = tot_obs
    ),
    fill = "#999999"
  ) +
  # print the count above each bar
  ggplot2::geom_text(
    ggplot2::aes(
      x = quarter_century,
      y = tot_obs,
      label = tot_obs
    ),
    size = 7.5,
    vjust = -0.85
  ) +
  # NOTE(review): the limits are 999-1901; a bar whose extent crosses either
  # limit (e.g. one centred on 1000 or 1900) would be dropped by the scale.
  # Check how `quarter_century` is coded and whether edge bars go missing.
  ggplot2::scale_x_continuous(
    breaks = x_scale,
    limits = c(min(x_scale) - 1, max(x_scale) + 1)
  ) +
  # add labels for title, x-axis, and y-axis
  labs_plot() +
  # define theme for producing bar plot
  ggplot2::theme_minimal(
    base_size = 40
  ) +
  theme_bar_plot()

# export plot (no `plot` argument: ggsave() saves the last plot)
ggplot2::ggsave(
  filename = paste0(dir_figures, file_name, ".png"),
  scale = 2,
  width = 10,
  height = 7,
  dpi = 300,
  device = "png"
)


# Figure 3: Available information for individual social conflicts ----
## compute descriptive statistics ----
## number of participants ----
# One-row summary: number of conflicts for which `nb_participants` is not
# "Unknown" (rows where it is NA are dropped by the filter as well).
df_participants <- df %>%
  dplyr::filter(nb_participants != "Unknown") %>%
  # label used to identify this statistic once all summaries are stacked
  dplyr::mutate(var_type = "Number of participants") %>%
  # total number of conflicts with information on the number of participants
  dplyr::count(var_type, name = "n")


## women participation ----
# One-row summary: number of conflicts for which `women_participation` is not
# "Unknown" (rows where it is NA are dropped by the filter as well).
df_women <- df %>%
  dplyr::filter(women_participation != "Unknown") %>%
  # label used to identify this statistic once all summaries are stacked
  dplyr::mutate(var_type = "Participation of women") %>%
  # total number of conflicts with information on the participation of women
  dplyr::count(var_type, name = "n")


## description > 5 words ----
# One-row summary: number of conflicts whose event description has more than
# 5 words, taking the longer of `description_event` (counted as `n_word_fr`)
# and `description_event_english` (counted as `n_word_en`).
df_desc <- df %>%
  dplyr::mutate(
    # words are the pieces obtained by splitting on single spaces; a missing
    # description is split into one NA piece and so counts as 1 word, which
    # never passes the filter below
    n_word_fr = lengths(
      strsplit(x = description_event, split = " ")
    ),
    n_word_en = lengths(
      strsplit(x = description_event_english, split = " ")
    ),
    # vectorised row-wise maximum of the two word counts
    n_word = pmax(n_word_fr, n_word_en, na.rm = TRUE)
  ) %>%
  # drop short descriptions: keep only those with more than 5 words
  dplyr::filter(n_word > 5) %>%
  dplyr::mutate(var_type = "Description of events") %>%
  # total number of conflicts with more than 5 words in the description
  dplyr::summarise(
    n = dplyr::n(),
    .by = var_type
  )


## archival sources ----
# One-row summary: number of conflicts with an archival signature.
# Records whose `id_riot_original_database` contains "JN" are checked against
# `primary_sources_2`; all other records against `primary_sources_1`.
df_sources <- df %>%
  dplyr::mutate(
    # TRUE when the original-database id contains "JN" (NA if the id is NA)
    id_has_jn = stringr::str_detect(
      string = id_riot_original_database,
      pattern = "JN"
    )
  ) %>%
  dplyr::filter(
    (id_has_jn & !is.na(primary_sources_2)) |
      (!id_has_jn & !is.na(primary_sources_1))
  ) %>%
  dplyr::mutate(var_type = "Archival signatures") %>%
  # total number of conflicts with at least one archival signature
  dplyr::summarise(
    n = dplyr::n(),
    .by = var_type
  )


## bibliographic references ----
# One-row summary: number of conflicts with at least one bibliographic
# reference, i.e. a non-missing `bibliography_1` or `bibliography_2`.
# NOTE(review): this filter previously required BOTH columns to be non-missing
# (`&`), which contradicts "at least one"; confirm the intended definition,
# since the change affects the count shown in Figure 3.
df_biblio <- df %>%
  dplyr::filter(
    !is.na(bibliography_1) | !is.na(bibliography_2)
  ) %>%
  dplyr::mutate(var_type = "Bibliographic references") %>%
  # total number of conflicts with at least one bibliographic reference
  dplyr::summarise(
    n = dplyr::n(),
    .by = var_type
  )

# Long table of all non-missing URL fields: one row per conflict id and
# URL column (every column whose name contains "url").
# NOTE(review): `df_url` is not used again in the visible part of this script.
df_url <- df %>%
  dplyr::select(id_riot_hiscod, tidyselect::contains("url")) %>%
  tidyr::pivot_longer(
    cols = tidyselect::contains("url"),
    names_to = "var_types",
    values_to = "var_values",
    values_drop_na = TRUE
  )


## bind data ----
# Stack the five one-row summaries, rename the count column to `tot_obs` and
# attach a numeric sort key (`var_order`) to each statistic.
df_sum <- dplyr::bind_rows(
  df_participants,
  df_women,
  df_desc,
  df_biblio,
  df_sources
) %>%
  dplyr::rename(tot_obs = n) %>%
  dplyr::mutate(
    # look up the sort key by label; labels not listed here yield NA
    var_order = unname(
      c(
        "Number of participants" = 5,
        "Participation of women" = 4,
        "Description of events" = 3,
        "Archival signatures" = 2,
        "Bibliographic references" = 1
      )[var_type]
    )
  )


## plot figure ----
# Plot settings for Figure 3.
# NOTE(review): apart from `file_name` and `y_scale`, these globals are not
# referenced again in this section; they are presumably read by the project
# helper `labs_plot()` -- confirm against its definition.
file_name <- paste("fig_3", "bar_plot_n_obs_information", sep = "_")
title_plot <- NULL
subtitle_plot <- NULL
caption_plot <- NULL
x_plot <- NULL
y_plot <- "Number of entries"
x_scale <- NULL
y_scale <- seq(from = 0, to = 20000, by = 5000)
title_legend <- NULL

# Horizontal bar plot of the number of entries per type of information.
df_sum %>%
  # plot
  ggplot2::ggplot() +
  # one bar per type of information, ordered by `var_order`
  ggplot2::geom_col(
    ggplot2::aes(
      x = reorder(var_type, var_order),
      y = tot_obs
    ),
    fill = "#999999"
  ) +
  # print the count next to each bar
  ggplot2::geom_text(
    ggplot2::aes(
      x = var_type,
      y = tot_obs,
      label = tot_obs
    ),
    size = 11,
    hjust = -0.25
  ) +
  ggplot2::scale_y_continuous(
    breaks = y_scale,
    limits = c(min(y_scale) - 1, max(y_scale) + 1)
  ) +
  # add labels for title, x-axis, and y-axis
  labs_plot() +
  # rotate plot
  ggplot2::coord_flip() +
  # define theme for producing bar plot
  ggplot2::theme_minimal(
    base_size = 40
  ) +
  theme_bar_plot()

# export plot (no `plot` argument: ggsave() saves the last plot)
ggplot2::ggsave(
  filename = paste0(dir_figures, file_name, ".png"),
  scale = 2,
  width = 10,
  height = 7,
  dpi = 300,
  device = "png"
)


# clean workspace ----
# Remove every object except functions and character variables.
rm(list = setdiff(ls(), c(lsf.str(), ls.str(mode = "character"))))
# Delete the R history file from the working directory, if there is one
# (calling file.remove() on a missing file would raise a warning).
if (file.exists(".Rhistory")) {
  file.remove(".Rhistory")
}

